{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "with open('./dataset/seeds_dataset.txt', 'r', encoding='utf8') as file:\n",
    "    for line in file.readlines():\n",
    "        line_data = list(map(lambda num: float(num), line.split()))\n",
    "        line_data[-1] = int(line_data[-1]) - 1\n",
    "        data.append(line_data)\n",
    "data = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "      <td>210.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>14.847524</td>\n",
       "      <td>14.559286</td>\n",
       "      <td>0.870999</td>\n",
       "      <td>5.628533</td>\n",
       "      <td>3.258605</td>\n",
       "      <td>3.700201</td>\n",
       "      <td>5.408071</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.909699</td>\n",
       "      <td>1.305959</td>\n",
       "      <td>0.023629</td>\n",
       "      <td>0.443063</td>\n",
       "      <td>0.377714</td>\n",
       "      <td>1.503557</td>\n",
       "      <td>0.491480</td>\n",
       "      <td>0.818448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>10.590000</td>\n",
       "      <td>12.410000</td>\n",
       "      <td>0.808100</td>\n",
       "      <td>4.899000</td>\n",
       "      <td>2.630000</td>\n",
       "      <td>0.765100</td>\n",
       "      <td>4.519000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>12.270000</td>\n",
       "      <td>13.450000</td>\n",
       "      <td>0.856900</td>\n",
       "      <td>5.262250</td>\n",
       "      <td>2.944000</td>\n",
       "      <td>2.561500</td>\n",
       "      <td>5.045000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>14.355000</td>\n",
       "      <td>14.320000</td>\n",
       "      <td>0.873450</td>\n",
       "      <td>5.523500</td>\n",
       "      <td>3.237000</td>\n",
       "      <td>3.599000</td>\n",
       "      <td>5.223000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>17.305000</td>\n",
       "      <td>15.715000</td>\n",
       "      <td>0.887775</td>\n",
       "      <td>5.979750</td>\n",
       "      <td>3.561750</td>\n",
       "      <td>4.768750</td>\n",
       "      <td>5.877000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>21.180000</td>\n",
       "      <td>17.250000</td>\n",
       "      <td>0.918300</td>\n",
       "      <td>6.675000</td>\n",
       "      <td>4.033000</td>\n",
       "      <td>8.456000</td>\n",
       "      <td>6.550000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                0           1           2           3           4           5  \\\n",
       "count  210.000000  210.000000  210.000000  210.000000  210.000000  210.000000   \n",
       "mean    14.847524   14.559286    0.870999    5.628533    3.258605    3.700201   \n",
       "std      2.909699    1.305959    0.023629    0.443063    0.377714    1.503557   \n",
       "min     10.590000   12.410000    0.808100    4.899000    2.630000    0.765100   \n",
       "25%     12.270000   13.450000    0.856900    5.262250    2.944000    2.561500   \n",
       "50%     14.355000   14.320000    0.873450    5.523500    3.237000    3.599000   \n",
       "75%     17.305000   15.715000    0.887775    5.979750    3.561750    4.768750   \n",
       "max     21.180000   17.250000    0.918300    6.675000    4.033000    8.456000   \n",
       "\n",
       "                6           7  \n",
       "count  210.000000  210.000000  \n",
       "mean     5.408071    1.000000  \n",
       "std      0.491480    0.818448  \n",
       "min      4.519000    0.000000  \n",
       "25%      5.045000    0.000000  \n",
       "50%      5.223000    1.000000  \n",
       "75%      5.877000    2.000000  \n",
       "max      6.550000    2.000000  "
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11.9144</td>\n",
       "      <td>13.2615</td>\n",
       "      <td>0.8502</td>\n",
       "      <td>5.2312</td>\n",
       "      <td>2.8639</td>\n",
       "      <td>5.0683</td>\n",
       "      <td>5.1053</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18.9630</td>\n",
       "      <td>16.3967</td>\n",
       "      <td>0.8860</td>\n",
       "      <td>6.2427</td>\n",
       "      <td>3.7499</td>\n",
       "      <td>3.5403</td>\n",
       "      <td>6.1008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13.3911</td>\n",
       "      <td>13.8652</td>\n",
       "      <td>0.8742</td>\n",
       "      <td>5.3660</td>\n",
       "      <td>3.1143</td>\n",
       "      <td>2.4158</td>\n",
       "      <td>4.9907</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.7191</td>\n",
       "      <td>14.9766</td>\n",
       "      <td>0.8804</td>\n",
       "      <td>5.7452</td>\n",
       "      <td>3.3985</td>\n",
       "      <td>3.1870</td>\n",
       "      <td>5.4484</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0        1       2       3       4       5       6\n",
       "0  11.9144  13.2615  0.8502  5.2312  2.8639  5.0683  5.1053\n",
       "1  18.9630  16.3967  0.8860  6.2427  3.7499  3.5403  6.1008\n",
       "2  13.3911  13.8652  0.8742  5.3660  3.1143  2.4158  4.9907\n",
       "3  15.7191  14.9766  0.8804  5.7452  3.3985  3.1870  5.4484"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "kmeans = KMeans(n_clusters=4, n_init=12)\n",
    "kmeans_result = kmeans.fit_predict(data.iloc[:, :-1])\n",
    "kmeans.labels_\n",
    "pd.DataFrame(data=np.around(kmeans.cluster_centers_, decimals=4), columns=data.columns[:-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>KMeans</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.26</td>\n",
       "      <td>14.84</td>\n",
       "      <td>0.8710</td>\n",
       "      <td>5.763</td>\n",
       "      <td>3.312</td>\n",
       "      <td>2.221</td>\n",
       "      <td>5.220</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14.88</td>\n",
       "      <td>14.57</td>\n",
       "      <td>0.8811</td>\n",
       "      <td>5.554</td>\n",
       "      <td>3.333</td>\n",
       "      <td>1.018</td>\n",
       "      <td>4.956</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>14.29</td>\n",
       "      <td>14.09</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>5.291</td>\n",
       "      <td>3.337</td>\n",
       "      <td>2.699</td>\n",
       "      <td>4.825</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13.84</td>\n",
       "      <td>13.94</td>\n",
       "      <td>0.8955</td>\n",
       "      <td>5.324</td>\n",
       "      <td>3.379</td>\n",
       "      <td>2.259</td>\n",
       "      <td>4.805</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16.14</td>\n",
       "      <td>14.99</td>\n",
       "      <td>0.9034</td>\n",
       "      <td>5.658</td>\n",
       "      <td>3.562</td>\n",
       "      <td>1.355</td>\n",
       "      <td>5.175</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1       2      3      4      5      6  7  KMeans\n",
       "0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  0       3\n",
       "1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  0       2\n",
       "2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  0       2\n",
       "3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  0       2\n",
       "4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  0       3"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmeans_data = data.copy()\n",
    "kmeans_data['KMeans'] = kmeans.labels_\n",
    "kmeans_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>KMeans</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.26</td>\n",
       "      <td>14.84</td>\n",
       "      <td>0.8710</td>\n",
       "      <td>5.763</td>\n",
       "      <td>3.312</td>\n",
       "      <td>2.221</td>\n",
       "      <td>5.220</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14.88</td>\n",
       "      <td>14.57</td>\n",
       "      <td>0.8811</td>\n",
       "      <td>5.554</td>\n",
       "      <td>3.333</td>\n",
       "      <td>1.018</td>\n",
       "      <td>4.956</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>14.29</td>\n",
       "      <td>14.09</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>5.291</td>\n",
       "      <td>3.337</td>\n",
       "      <td>2.699</td>\n",
       "      <td>4.825</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13.84</td>\n",
       "      <td>13.94</td>\n",
       "      <td>0.8955</td>\n",
       "      <td>5.324</td>\n",
       "      <td>3.379</td>\n",
       "      <td>2.259</td>\n",
       "      <td>4.805</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16.14</td>\n",
       "      <td>14.99</td>\n",
       "      <td>0.9034</td>\n",
       "      <td>5.658</td>\n",
       "      <td>3.562</td>\n",
       "      <td>1.355</td>\n",
       "      <td>5.175</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1       2      3      4      5      6  7  KMeans\n",
       "0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  0       0\n",
       "1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  0       0\n",
       "2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  0       0\n",
       "3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  0       0\n",
       "4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  0       0"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 大类匹配\n",
    "# 取预测类中真实类型的最大值为真实类\n",
    "mapped = {}\n",
    "for pre_num in range(4):\n",
    "    predict_true_nums = list(set(kmeans_data[kmeans_data['KMeans'] == pre_num][7]))\n",
    "    tomap = list(map(lambda x: list(kmeans_data[kmeans_data['KMeans'] == pre_num][7]).count(x), set(kmeans_data[kmeans_data['KMeans'] == pre_num][7])))\n",
    "    mapped[pre_num] = predict_true_nums[tomap.index(max(tomap))]\n",
    "kmeans_data['KMeans'] = kmeans_data['KMeans'].replace(mapped)\n",
    "kmeans_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0, -1,\n",
       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
       "        0,  0,  0, -1,  0, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,\n",
       "       -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
       "        0,  0,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  2,  3, -1,  2,\n",
       "        2,  2,  2,  3,  3, -1,  2,  2, -1,  2,  2,  2,  2,  2,  2,  2,  2,\n",
       "        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -1,  3,  2,  2,  2,  2,\n",
       "        2, -1,  2,  2,  2,  0,  2,  2,  2, -1, -1,  2,  2,  1,  2,  1,  0,\n",
       "        2,  0,  0,  2,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
       "        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,\n",
       "        0,  0,  0, -1,  0,  0])"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import DBSCAN\n",
    "\n",
    "dbscan = DBSCAN(eps=0.9, min_samples=4)\n",
    "dbscan_result = dbscan.fit_predict(data.iloc[:, :-1])\n",
    "dbscan.labels_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'clusters = 4, noise = 15'"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clusters = len(set(dbscan.labels_)) - abs(dbscan.labels_.min())\n",
    "noise = list(dbscan.labels_).count(-1)\n",
    "f'clusters = {clusters}, noise = {noise}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>DBSCAN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.26</td>\n",
       "      <td>14.84</td>\n",
       "      <td>0.8710</td>\n",
       "      <td>5.763</td>\n",
       "      <td>3.312</td>\n",
       "      <td>2.221</td>\n",
       "      <td>5.220</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14.88</td>\n",
       "      <td>14.57</td>\n",
       "      <td>0.8811</td>\n",
       "      <td>5.554</td>\n",
       "      <td>3.333</td>\n",
       "      <td>1.018</td>\n",
       "      <td>4.956</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>14.29</td>\n",
       "      <td>14.09</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>5.291</td>\n",
       "      <td>3.337</td>\n",
       "      <td>2.699</td>\n",
       "      <td>4.825</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13.84</td>\n",
       "      <td>13.94</td>\n",
       "      <td>0.8955</td>\n",
       "      <td>5.324</td>\n",
       "      <td>3.379</td>\n",
       "      <td>2.259</td>\n",
       "      <td>4.805</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16.14</td>\n",
       "      <td>14.99</td>\n",
       "      <td>0.9034</td>\n",
       "      <td>5.658</td>\n",
       "      <td>3.562</td>\n",
       "      <td>1.355</td>\n",
       "      <td>5.175</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1       2      3      4      5      6  7  DBSCAN\n",
       "0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  0       0\n",
       "1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  0       0\n",
       "2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  0       0\n",
       "3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  0       0\n",
       "4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  0       0"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dbscan_data = data.copy()\n",
    "dbscan_data['DBSCAN'] = dbscan.labels_\n",
    "dbscan_data = dbscan_data[dbscan_data['DBSCAN'] >= 0]\n",
    "dbscan_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[64, 4, 66]\n",
      "[2, 2]\n",
      "[52]\n",
      "[5]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>DBSCAN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.26</td>\n",
       "      <td>14.84</td>\n",
       "      <td>0.8710</td>\n",
       "      <td>5.763</td>\n",
       "      <td>3.312</td>\n",
       "      <td>2.221</td>\n",
       "      <td>5.220</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14.88</td>\n",
       "      <td>14.57</td>\n",
       "      <td>0.8811</td>\n",
       "      <td>5.554</td>\n",
       "      <td>3.333</td>\n",
       "      <td>1.018</td>\n",
       "      <td>4.956</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>14.29</td>\n",
       "      <td>14.09</td>\n",
       "      <td>0.9050</td>\n",
       "      <td>5.291</td>\n",
       "      <td>3.337</td>\n",
       "      <td>2.699</td>\n",
       "      <td>4.825</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13.84</td>\n",
       "      <td>13.94</td>\n",
       "      <td>0.8955</td>\n",
       "      <td>5.324</td>\n",
       "      <td>3.379</td>\n",
       "      <td>2.259</td>\n",
       "      <td>4.805</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16.14</td>\n",
       "      <td>14.99</td>\n",
       "      <td>0.9034</td>\n",
       "      <td>5.658</td>\n",
       "      <td>3.562</td>\n",
       "      <td>1.355</td>\n",
       "      <td>5.175</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1       2      3      4      5      6  7  DBSCAN\n",
       "0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  0       2\n",
       "1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  0       2\n",
       "2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  0       2\n",
       "3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  0       2\n",
       "4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  0       2"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 大类匹配\n",
    "# 取预测类中真实类型的最大值为真实类\n",
    "mapped = {}\n",
    "for pre_num in range(4):\n",
    "    predict_true_nums = list(set(dbscan_data[dbscan_data['DBSCAN'] == pre_num][7]))\n",
    "    tomap = list(map(lambda x: list(dbscan_data[dbscan_data['DBSCAN'] == pre_num][7]).count(x), set(dbscan_data[dbscan_data['DBSCAN'] == pre_num][7])))\n",
    "    mapped[pre_num] = predict_true_nums[tomap.index(max(tomap))]\n",
    "    print(tomap)\n",
    "dbscan_data['DBSCAN'] = dbscan_data['DBSCAN'].replace(mapped)\n",
    "dbscan_data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8666666666666667 0.6410256410256411\n",
      "0.8692441492726123 0.5557142857142857\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score, accuracy_score\n",
    "\n",
    "print(accuracy_score(kmeans_data[7], kmeans_data['KMeans']), accuracy_score(dbscan_data[7], dbscan_data['DBSCAN']))\n",
    "print(f1_score(kmeans_data[7], kmeans_data['KMeans'], average='macro'), f1_score(dbscan_data[7], dbscan_data['DBSCAN'], average='macro'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.6 ('env': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e6a44bc4ae6224bc4bc7b3a89841677b0124c79fe22f3d101cb1ffa8756f30f3"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
